# load necessary libraries
library(tidyverse)  # dplyr/tidyr/readr/ggplot2 etc. for data wrangling and plotting
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(brms)  # Bayesian regression models fitted via Stan (brm())
## Loading required package: Rcpp
## Loading 'brms' package (version 2.22.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## 
## Attaching package: 'brms'
## 
## The following object is masked from 'package:stats':
## 
##     ar
library(ggplot2)  # NOTE(review): redundant — ggplot2 is already attached by tidyverse above
library(scales)  # scale/label formatting helpers (no visible use in this chunk — verify needed)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(caret)  # train()/trainControl() for the cross-validation in section 3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# (removed) duplicate library(tidyverse) call — tidyverse is already attached
# at the top of this script, so re-attaching it here was a no-op.
# read the cleaned flight data set
flight_data <- read.csv("flight_data_cleaned.csv")

# keep only rows that are complete in the response and every predictor column
required_cols <- c(
  "Fare", "Number.Of.Stops", "Total_Minutes", "distance",
  "IsWeekend", "ifHoliday", "Is_Low_Cost", "Low_Cost_Count",
  "Departure.Off.Peak", "Arrival.Off.Peak"
)
flight_data <- flight_data %>%
  drop_na(all_of(required_cols))

# mean and standard deviation of the response variable (Fare);
# use `<-` for assignment per R convention, not `=`
mean_fare <- mean(flight_data$Fare)
sd_fare <- sd(flight_data$Fare)
cat(mean_fare, "\n")
## 23769.17
cat(sd_fare, "\n")
## 20344.6

1. Basic models

1.1 linear regression

# basic linear regression model: Fare as an additive linear function of the
# nine predictors (no interaction terms); this is the baseline that all the
# interaction models in section 2 are compared against with anova()
lm_basic <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance + 
               IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + 
               Departure.Off.Peak + Arrival.Off.Peak,
               data = flight_data)

# summary: coefficient estimates, t-tests, residual SE and R-squared
summary(lm_basic)
## 
## Call:
## lm(formula = Fare ~ Number.Of.Stops + Total_Minutes + distance + 
##     IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak, data = flight_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58312  -8929  -2390   4275  71496 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.610e+03  3.365e+02  19.642  < 2e-16 ***
## Number.Of.Stops     1.101e+03  2.142e+02   5.139 2.79e-07 ***
## Total_Minutes       5.858e+00  2.215e-01  26.454  < 2e-16 ***
## distance            3.415e+00  3.258e-02 104.804  < 2e-16 ***
## IsWeekend           1.351e+03  1.919e+02   7.037 2.01e-12 ***
## ifHoliday          -1.657e+03  4.178e+02  -3.967 7.29e-05 ***
## Is_Low_Cost        -4.486e+03  8.700e+02  -5.156 2.54e-07 ***
## Low_Cost_Count     -1.960e+03  4.291e+02  -4.567 4.96e-06 ***
## Departure.Off.Peak  6.226e+02  1.965e+02   3.168  0.00154 ** 
## Arrival.Off.Peak    3.167e+03  2.015e+02  15.718  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14930 on 27438 degrees of freedom
## Multiple R-squared:  0.4617, Adjusted R-squared:  0.4615 
## F-statistic:  2615 on 9 and 27438 DF,  p-value: < 2.2e-16

1.2 bayesian regression

# basic bayesian regression model: same linear predictor as lm_basic, with a
# gaussian likelihood and brms default priors; fitted via Stan (NUTS)
bayes_model <- brm(
  Fare ~ Number.Of.Stops + Total_Minutes + distance + 
  IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + 
  Departure.Off.Peak + Arrival.Off.Peak,
  data = flight_data,
  family = gaussian(),
  chains = 4,
  iter = 2000  # per chain; the first 1000 iterations are warmup (see summary below)
)
## Compiling Stan program...
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 9.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 8.876 seconds (Warm-up)
## Chain 1:                2.137 seconds (Sampling)
## Chain 1:                11.013 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 9e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.9 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 11.301 seconds (Warm-up)
## Chain 2:                2.165 seconds (Sampling)
## Chain 2:                13.466 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 8.9e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 10.328 seconds (Warm-up)
## Chain 3:                2.351 seconds (Sampling)
## Chain 3:                12.679 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 8.9e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 13.095 seconds (Warm-up)
## Chain 4:                2.302 seconds (Sampling)
## Chain 4:                15.397 seconds (Total)
## Chain 4:
# posterior summary of the basic bayesian model: point estimates, 95% credible
# intervals, and convergence diagnostics (Rhat should be ~1, ESS reasonably large)
summary(bayes_model)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + Arrival.Off.Peak 
##    Data: flight_data (Number of observations: 27448) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Regression Coefficients:
##                    Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept           6606.60    332.55  5949.19  7263.32 1.00     4409     3273
## Number.Of.Stops     1104.37    215.97   672.38  1523.31 1.00     3127     3100
## Total_Minutes          5.86      0.22     5.40     6.29 1.00     3474     2692
## distance               3.41      0.03     3.35     3.48 1.00     3999     2625
## IsWeekend           1349.16    192.54   974.83  1733.95 1.00     4132     2987
## ifHoliday          -1659.98    413.90 -2476.49  -854.96 1.00     3939     2939
## Is_Low_Cost        -4466.86    851.76 -6107.22 -2782.98 1.00     2302     2529
## Low_Cost_Count     -1967.62    420.40 -2800.33 -1142.59 1.00     2307     2770
## Departure.Off.Peak   623.77    191.43   260.79   999.91 1.00     4134     3008
## Arrival.Off.Peak    3168.78    206.44  2763.71  3571.59 1.00     4048     2793
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 14929.46     64.76 14800.94 15056.29 1.00     6467     3043
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Bayesian R-squared: posterior distribution of the proportion of variance explained
bayes_R2(bayes_model)
##     Estimate   Est.Error      Q2.5     Q97.5
## R2 0.4616525 0.003250939 0.4552978 0.4679927

2. Models with interaction terms

2.1 weekend and holiday

# interaction model 1: baseline model plus IsWeekend:ifHoliday
# (does the weekend effect on Fare change when the day is also a holiday?)
lm_interaction1 <- update(lm_basic, . ~ . + IsWeekend:ifHoliday)

# nested-model F-test: does the interaction term significantly reduce RSS?
anova(lm_basic, lm_interaction1)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + IsWeekend:ifHoliday
##   Res.Df        RSS Df Sum of Sq      F  Pr(>F)  
## 1  27438 6.1152e+12                              
## 2  27437 6.1144e+12  1 806583409 3.6194 0.05712 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.2 low-cost operators and distance

# interaction model 2: baseline model plus Is_Low_Cost:distance
# (do low-cost operators price distance differently from regular operators?)
lm_interaction2 <- update(lm_basic, . ~ . + Is_Low_Cost:distance)

# nested-model F-test against the basic model
anova(lm_basic, lm_interaction2)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Is_Low_Cost:distance
##   Res.Df        RSS Df  Sum of Sq     F    Pr(>F)    
## 1  27438 6.1152e+12                                  
## 2  27437 6.0416e+12  1 7.3568e+10 334.1 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Visualize the pricing differences of low-cost operators across various distances:
# scatter plus one lm() fit line per operator type — diverging slopes illustrate
# the Is_Low_Cost:distance interaction tested above
ggplot(flight_data, aes(x = distance, y = Fare, color = factor(Is_Low_Cost))) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +  # separate linear fit per colour group
  theme_minimal() +
  labs(title = "The distance-Fare relationship of Low-cost operators vs. regular operators",
       color = "whether low-cost operator or not(0=no, 1=yes)")
## `geom_smooth()` using formula = 'y ~ x'

2.3 Number.Of.Stops and Total_Minutes

# interaction model 3: baseline model plus Total_Minutes:Number.Of.Stops
# (does the effect of flight duration on Fare depend on the number of stops?)
lm_interaction3 <- update(lm_basic, . ~ . + Total_Minutes:Number.Of.Stops)

# nested-model F-test against the basic model
anova(lm_basic, lm_interaction3)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Total_Minutes:Number.Of.Stops
##   Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
## 1  27438 6.1152e+12                                   
## 2  27437 6.0974e+12  1 1.7716e+10 79.717 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analyze the impact of Total_Minutes on Fare under different Number.Of.Stops:
# one lm() fit line per stop count visualizes the interaction tested above
ggplot(flight_data, aes(x = Total_Minutes, y = Fare, color = factor(Number.Of.Stops))) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +  # separate linear fit per stop count
  theme_minimal() +
  labs(title = "Total_Minutes-Fare relationship under different Number.Of.Stops",
       color = "Number.Of.Stops")
## `geom_smooth()` using formula = 'y ~ x'

2.4 Departure.Off.Peak and Arrival.Off.Peak

# interaction model 4: baseline model plus Departure.Off.Peak:Arrival.Off.Peak
# (combined effect of both the departure AND the arrival being off-peak)
lm_interaction4 <- update(lm_basic, . ~ . + Departure.Off.Peak:Arrival.Off.Peak)

# nested-model F-test against the basic model
anova(lm_basic, lm_interaction4)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Departure.Off.Peak:Arrival.Off.Peak
##   Res.Df        RSS Df  Sum of Sq     F  Pr(>F)  
## 1  27438 6.1152e+12                              
## 2  27437 6.1137e+12  1 1464187478 6.571 0.01037 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.5 Total_Minutes and ifHoliday

# interaction model 5: baseline model plus Total_Minutes:ifHoliday
# (does ifHoliday moderate the relationship between Total_Minutes and Fare?)
lm_interaction5 <- update(lm_basic, . ~ . + Total_Minutes:ifHoliday)

# nested-model F-test against the basic model
anova(lm_basic, lm_interaction5)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Total_Minutes:ifHoliday
##   Res.Df        RSS Df Sum of Sq      F Pr(>F)
## 1  27438 6.1152e+12                           
## 2  27437 6.1148e+12  1 407926201 1.8304 0.1761

2.6 Total_Minutes and IsWeekend

# interaction model 6: baseline model plus Total_Minutes:IsWeekend
# (does IsWeekend moderate the relationship between Total_Minutes and Fare?)
lm_interaction6 <- update(lm_basic, . ~ . + Total_Minutes:IsWeekend)

# nested-model F-test against the basic model
anova(lm_basic, lm_interaction6)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Total_Minutes:IsWeekend
##   Res.Df        RSS Df Sum of Sq      F Pr(>F)
## 1  27438 6.1152e+12                           
## 2  27437 6.1149e+12  1 244793203 1.0984 0.2946

2.7 Departure.Off.Peak and Number.Of.Stops

# interaction model 7: baseline model plus Departure.Off.Peak:Number.Of.Stops
# (passengers may tolerate more stops during day flights than night flights)
lm_interaction7 <- update(lm_basic, . ~ . + Departure.Off.Peak:Number.Of.Stops)

# nested-model F-test against the basic model
anova(lm_basic, lm_interaction7)
## Analysis of Variance Table
## 
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + 
##     ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Departure.Off.Peak:Number.Of.Stops
##   Res.Df        RSS Df Sum of Sq      F Pr(>F)
## 1  27438 6.1152e+12                           
## 2  27437 6.1150e+12  1 138755764 0.6226 0.4301

2.8 ANOVA comparison table

library(gridExtra)  # tableGrob() for rendering data frames as graphical tables
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)  # gpar()/grid.draw() for low-level grob styling

# Interaction terms tested in section 2, in the order they were fitted
model_names <- c("IsWeekend:ifHoliday", "Is_Low_Cost:distance", "Total_Minutes:Number.Of.Stops", 
                 "Departure.Off.Peak:Arrival.Off.Peak", "Total_Minutes:ifHoliday", "Total_Minutes:IsWeekend", 
                 "Departure.Off.Peak:Number.Of.Stops")

# Hand-copied (p-value, sum of squares) pairs from the ANOVA outputs above.
# Note: c() coerces each mixed pair to character, so both extracted columns
# below are character vectors.
anova_list <- list(c("0.05712 .", 806583409), c("2.2e-16 ***", "7.3568e+10"), 
                   c("2.2e-16 ***", "1.7716e+10"), c("0.01037 *", 1464187478), 
                   c("0.1761", 407926201), c("0.2946", 244793203), c("0.4301", 138755764))

# Build the comparison table; vapply() (instead of sapply()) guarantees a
# character result even for empty or malformed input
anova_summary <- data.frame(
  Model = model_names,
  P_value = vapply(anova_list, function(x) x[1], character(1)),
  Sum_of_Sq = vapply(anova_list, function(x) x[2], character(1))
)

# Create the table grob (no row-name column)
table_grob <- tableGrob(anova_summary, rows = NULL)

# Style cells via the gtable layout names instead of raw grob positions:
#   "colhead-bg" / "colhead-fg" = header background / header text cells
#   "core-fg"                   = body text cells
# BUG FIX: the original code styled the first ncol() grobs as headers by
# position, then overwrote *every* grob's gp with gpar(fontsize = 12),
# which silently wiped out the header styling it had just applied.
for (i in which(table_grob$layout$name == "colhead-bg")) {
  table_grob$grobs[[i]]$gp <- gpar(fill = "#d6d6d6", col = "black")
}
for (i in which(table_grob$layout$name == "colhead-fg")) {
  table_grob$grobs[[i]]$gp <- gpar(fontsize = 13, fontface = "bold")
}
for (i in which(table_grob$layout$name == "core-fg")) {
  table_grob$grobs[[i]]$gp <- gpar(fontsize = 12)
}

# Save high-resolution image
png("anova_table_academic.png", width = 12, height = 4, units = "in", res = 400)
grid.draw(table_grob)
dev.off()
## quartz_off_screen 
##                 2
# Display the table
grid.draw(table_grob)

2.9 comprehensive model with effective interaction terms

2.9.1 linear regression

# comprehensive linear model: baseline predictors plus the three interaction
# terms that were significant in the pairwise ANOVA comparisons above
lm_important_interactions <- update(
  lm_basic,
  . ~ . +
    Is_Low_Cost:distance +                 # interaction 2
    Total_Minutes:Number.Of.Stops +        # interaction 3
    Departure.Off.Peak:Arrival.Off.Peak    # interaction 4
)

# coefficient estimates, t-tests and R-squared of the combined model
summary(lm_important_interactions)
## 
## Call:
## lm(formula = Fare ~ Number.Of.Stops + Total_Minutes + distance + 
##     IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + 
##     Arrival.Off.Peak + Is_Low_Cost:distance + Total_Minutes:Number.Of.Stops + 
##     Departure.Off.Peak:Arrival.Off.Peak, data = flight_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -61972  -8648  -2540   4088  73270 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         -2.200e+02  5.874e+02  -0.375 0.707977    
## Number.Of.Stops                      5.511e+03  4.231e+02  13.025  < 2e-16 ***
## Total_Minutes                        1.122e+01  5.029e-01  22.306  < 2e-16 ***
## distance                             3.589e+00  3.373e-02 106.419  < 2e-16 ***
## IsWeekend                            1.356e+03  1.903e+02   7.128 1.04e-12 ***
## ifHoliday                           -1.622e+03  4.144e+02  -3.915 9.07e-05 ***
## Is_Low_Cost                          6.657e+03  1.081e+03   6.159 7.41e-10 ***
## Low_Cost_Count                      -5.092e+03  5.037e+02 -10.109  < 2e-16 ***
## Departure.Off.Peak                   1.241e+03  2.327e+02   5.331 9.82e-08 ***
## Arrival.Off.Peak                     3.637e+03  2.362e+02  15.396  < 2e-16 ***
## distance:Is_Low_Cost                -2.049e+00  1.044e-01 -19.621  < 2e-16 ***
## Number.Of.Stops:Total_Minutes       -3.642e+00  3.219e-01 -11.314  < 2e-16 ***
## Departure.Off.Peak:Arrival.Off.Peak -1.477e+03  4.320e+02  -3.419 0.000628 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14800 on 27435 degrees of freedom
## Multiple R-squared:  0.4708, Adjusted R-squared:  0.4706 
## F-statistic:  2034 on 12 and 27435 DF,  p-value: < 2.2e-16

2.9.2 bayesian regression

# comprehensive bayesian regression model: same linear predictor as
# lm_important_interactions (baseline + three retained interaction terms),
# gaussian likelihood with brms default priors, fitted via Stan (NUTS)
bayes_important_interactions <- brm(
  Fare ~ Number.Of.Stops + Total_Minutes + distance + 
  IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + 
  Departure.Off.Peak + Arrival.Off.Peak +
  Is_Low_Cost:distance +
  Total_Minutes:Number.Of.Stops +
  Departure.Off.Peak:Arrival.Off.Peak,
  data = flight_data,
  family = gaussian(),
  chains = 4,
  iter = 2000  # per chain; first 1000 iterations are warmup (see summary below)
)
## Compiling Stan program...
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000109 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.09 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 14.614 seconds (Warm-up)
## Chain 1:                3.263 seconds (Sampling)
## Chain 1:                17.877 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 9.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 9.671 seconds (Warm-up)
## Chain 2:                3.189 seconds (Sampling)
## Chain 2:                12.86 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 9.5e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.95 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 11.307 seconds (Warm-up)
## Chain 3:                3.187 seconds (Sampling)
## Chain 3:                14.494 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 9.6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 16.213 seconds (Warm-up)
## Chain 4:                3.079 seconds (Sampling)
## Chain 4:                19.292 seconds (Total)
## Chain 4:
# posterior summary of the comprehensive interaction model: estimates, 95%
# credible intervals, and convergence diagnostics (Rhat ~ 1, ESS)
summary(bayes_important_interactions)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + Arrival.Off.Peak + Is_Low_Cost:distance + Total_Minutes:Number.Of.Stops + Departure.Off.Peak:Arrival.Off.Peak 
##    Data: flight_data (Number of observations: 27448) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Regression Coefficients:
##                                     Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept                            -200.99    583.17 -1334.40   968.32 1.00
## Number.Of.Stops                      5501.72    413.03  4692.94  6298.47 1.00
## Total_Minutes                          11.20      0.50    10.23    12.18 1.00
## distance                                3.59      0.03     3.52     3.66 1.00
## IsWeekend                            1357.90    192.93   981.92  1740.77 1.00
## ifHoliday                           -1628.18    407.77 -2419.94  -823.82 1.00
## Is_Low_Cost                          6623.99   1075.15  4471.19  8698.11 1.00
## Low_Cost_Count                      -5079.30    501.16 -6019.01 -4067.60 1.00
## Departure.Off.Peak                   1244.03    230.99   786.98  1703.54 1.00
## Arrival.Off.Peak                     3641.36    235.73  3184.41  4095.07 1.00
## distance:Is_Low_Cost                   -2.05      0.11    -2.25    -1.84 1.00
## Number.Of.Stops:Total_Minutes          -3.63      0.31    -4.24    -3.01 1.00
## Departure.Off.Peak:Arrival.Off.Peak -1480.79    430.43 -2311.05  -603.31 1.00
##                                     Bulk_ESS Tail_ESS
## Intercept                               1551     2484
## Number.Of.Stops                         1646     2347
## Total_Minutes                           1819     2726
## distance                                4758     2706
## IsWeekend                               4669     2823
## ifHoliday                               4722     3090
## Is_Low_Cost                             1536     2506
## Low_Cost_Count                          1694     2625
## Departure.Off.Peak                      3741     3036
## Arrival.Off.Peak                        3689     2947
## distance:Is_Low_Cost                    4322     3005
## Number.Of.Stops:Total_Minutes           1533     2336
## Departure.Off.Peak:Arrival.Off.Peak     3090     2629
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 14803.63     63.98 14677.69 14936.09 1.00     7845     2295
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Bayesian R-squared of the comprehensive interaction model
bayes_R2(bayes_important_interactions)
##     Estimate   Est.Error      Q2.5     Q97.5
## R2 0.4707822 0.003218736 0.4644047 0.4769575

3. Cross validation & model improvement

3.1 10-fold cv with un-scaled data

# reproducibility: fix the seed so the CV fold assignment is repeatable
set.seed(123)

# model formula: baseline predictors plus the three retained interaction terms
# (same specification as lm_important_interactions above)
model_formula <- Fare ~ Number.Of.Stops + Total_Minutes + distance + 
                IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + 
                Departure.Off.Peak + Arrival.Off.Peak +
                Is_Low_Cost:distance +
                Total_Minutes:Number.Of.Stops +
                Departure.Off.Peak:Arrival.Off.Peak

# define 10-fold cross-validation; verboseIter prints per-fold progress
train_control <- trainControl(
  method = "cv",
  number = 10,  # 10-fold CV
  verboseIter = TRUE
)
# run the CV; caret refits a final model on the full training set afterwards
# (see "Fitting final model on full training set" in the log below)
cv_model <- train(
  model_formula,
  data = flight_data,
  method = "lm",
  trControl = train_control
)
## + Fold01: intercept=TRUE 
## - Fold01: intercept=TRUE 
## + Fold02: intercept=TRUE 
## - Fold02: intercept=TRUE 
## + Fold03: intercept=TRUE 
## - Fold03: intercept=TRUE 
## + Fold04: intercept=TRUE 
## - Fold04: intercept=TRUE 
## + Fold05: intercept=TRUE 
## - Fold05: intercept=TRUE 
## + Fold06: intercept=TRUE 
## - Fold06: intercept=TRUE 
## + Fold07: intercept=TRUE 
## - Fold07: intercept=TRUE 
## + Fold08: intercept=TRUE 
## - Fold08: intercept=TRUE 
## + Fold09: intercept=TRUE 
## - Fold09: intercept=TRUE 
## + Fold10: intercept=TRUE 
## - Fold10: intercept=TRUE 
## Aggregating results
## Fitting final model on full training set
# Overall CV summary: sample size, resampling scheme, mean RMSE / R^2 / MAE
print(cv_model)
## Linear Regression 
## 
## 27448 samples
##     9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 24703, 24704, 24702, 24704, 24704, 24704, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   14804.66  0.4705339  10211.12
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Detailed results table, including SDs of the resampled metrics
print(cv_model$results)
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 14804.66 0.4705339 10211.12 298.8753 0.01832668 192.2191
# Diagnostic plot: predictions from the final refit model on the full data
predictions <- predict(cv_model, flight_data)

# Predicted against actual fares, with a y = x reference line
plot_df <- data.frame(predicted = predictions, actual = flight_data$Fare)
ggplot(plot_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "prediction values vs actual values",
    x = "actual fare",
    y = "prediction fare"
  )

# Per-fold hold-out metrics (RMSE, R^2, MAE) from the 10-fold CV
cv_metrics <- cv_model$resample
print(cv_metrics)
##        RMSE  Rsquared       MAE Resample
## 1  14714.45 0.4753627 10216.176   Fold01
## 2  15017.53 0.4747368 10161.424   Fold02
## 3  14798.81 0.4785571 10081.270   Fold03
## 4  15112.10 0.4369495 10516.230   Fold04
## 5  14789.66 0.4785613 10258.759   Fold05
## 6  14809.83 0.4486708 10314.884   Fold06
## 7  14703.75 0.4881551 10044.551   Fold07
## 8  15153.71 0.4611778 10388.540   Fold08
## 9  14863.40 0.4641939 10293.771   Fold09
## 10 14083.40 0.4989744  9835.627   Fold10
# Aggregate the per-fold metrics: mean and spread of RMSE and R-squared
fold_stats <- c(
  Mean_RMSE     = mean(cv_metrics$RMSE),
  SD_RMSE       = sd(cv_metrics$RMSE),
  Mean_Rsquared = mean(cv_metrics$Rsquared),
  SD_Rsquared   = sd(cv_metrics$Rsquared)
)
summary_metrics <- as.data.frame(as.list(fold_stats))
print(summary_metrics)
##   Mean_RMSE  SD_RMSE Mean_Rsquared SD_Rsquared
## 1  14804.66 298.8753     0.4705339  0.01832668

3.2 10-fold cv with scaled data

Why scale?

Standardizing Fare expresses the RMSE in standard-deviation units of the fare (a unitless scale), which makes model performance easier to interpret and to compare across models. (Note that a standardized RMSE is not bounded between 0 and 1.)

3.2.1 Linear Model Cross Validation

# Reference statistics for Fare (used to interpret the scaled-model RMSE)
mean_fare <- mean(flight_data$Fare)
sd_fare <- sd(flight_data$Fare)
cat("Fare's mean:", mean_fare, "\n")
## Fare's mean: 23769.17
cat("Fare's standard deviation:", sd_fare, "\n")
## Fare's standard deviation: 20344.6
# Standardize Fare. scale() returns an n-by-1 matrix; keeping a matrix
# column inside a data frame is a known footgun for downstream code
# (formula interfaces, data.frame()/ggplot construction), so drop the
# dim attributes with as.numeric().
flight_data_scaled <- flight_data %>%
  mutate(Fare_scaled = as.numeric(scale(Fare)))

# Cross-validate the same specification, now on the standardized response
set.seed(123)

# Same 10-fold CV settings as before
train_control <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)

# Identical predictors and interactions; only the response changes
cv_model_scaled <- train(
  Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
    IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
    Departure.Off.Peak + Arrival.Off.Peak +
    Is_Low_Cost:distance +
    Total_Minutes:Number.Of.Stops +
    Departure.Off.Peak:Arrival.Off.Peak,
  data = flight_data_scaled,
  method = "lm",
  trControl = train_control
)
## + Fold01: intercept=TRUE 
## - Fold01: intercept=TRUE 
## + Fold02: intercept=TRUE 
## - Fold02: intercept=TRUE 
## + Fold03: intercept=TRUE 
## - Fold03: intercept=TRUE 
## + Fold04: intercept=TRUE 
## - Fold04: intercept=TRUE 
## + Fold05: intercept=TRUE 
## - Fold05: intercept=TRUE 
## + Fold06: intercept=TRUE 
## - Fold06: intercept=TRUE 
## + Fold07: intercept=TRUE 
## - Fold07: intercept=TRUE 
## + Fold08: intercept=TRUE 
## - Fold08: intercept=TRUE 
## + Fold09: intercept=TRUE 
## - Fold09: intercept=TRUE 
## + Fold10: intercept=TRUE 
## - Fold10: intercept=TRUE 
## Aggregating results
## Fitting final model on full training set
# CV summary for the scaled model (RMSE/MAE now in SD units of Fare)
print(cv_model_scaled)
## Linear Regression 
## 
## 27448 samples
##     9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 24703, 24704, 24702, 24704, 24704, 24703, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.7276648  0.4705238  0.5018736
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Visual check: in-sample predictions against the standardized fares
predictions_scaled <- predict(cv_model_scaled, flight_data_scaled)

scaled_plot_df <- data.frame(
  predicted = predictions_scaled,
  actual = flight_data_scaled$Fare_scaled
)
ggplot(scaled_plot_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "predictive fare VS actual fare after scaling",
    x = "actual fare - scaled",
    y = "predictive fare - scaled"
  )

3.2.2 Bayesian Model Cross Validation

# Bayesian Model Cross Validation
# Set random seed for reproducible fold assignment
set.seed(123)

# Create 10-fold indices on the response
folds <- createFolds(flight_data_scaled$Fare_scaled, k = 10, list = TRUE)

# Preallocated per-fold storage
cv_results_bayes <- vector("list", 10)
rmse_values <- numeric(10)
r2_values <- numeric(10)

# Same fixed effects + interactions as the linear CV model
bayes_cv_formula <- Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
  IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
  Departure.Off.Peak + Arrival.Off.Peak +
  Is_Low_Cost:distance +
  Total_Minutes:Number.Of.Stops +
  Departure.Off.Peak:Arrival.Off.Peak

# Perform cross validation for each fold. The Stan model is compiled once
# on the first fold and reused via update() afterwards, avoiding ~10x
# recompilation cost (the original recompiled inside every iteration).
bayes_cv_model <- NULL
for (i in 1:10) {
  # Hold out fold i, train on the remaining nine folds
  test_indices <- folds[[i]]
  train_data <- flight_data_scaled[-test_indices, ]
  test_data <- flight_data_scaled[test_indices, ]

  if (is.null(bayes_cv_model)) {
    bayes_cv_model <- brm(
      bayes_cv_formula,
      data = train_data,
      family = gaussian(),
      chains = 4,
      iter = 2000,
      cores = 4,
      silent = 2
    )
  } else {
    # Refit the already-compiled model on this fold's training data
    bayes_cv_model <- update(bayes_cv_model, newdata = train_data,
                             cores = 4, silent = 2)
  }

  # Posterior-mean predictions on the held-out fold
  predictions <- predict(bayes_cv_model, newdata = test_data)

  # Hold-out RMSE on the scaled response
  rmse_values[i] <- sqrt(mean((predictions[, "Estimate"] - test_data$Fare_scaled)^2))

  # In-sample Bayesian R^2 for this fold's fit
  r2_values[i] <- bayes_R2(bayes_cv_model)[, "Estimate"]

  # Keep per-fold results (this list was declared but never filled before)
  cv_results_bayes[[i]] <- list(rmse = rmse_values[i], r2 = r2_values[i])
}
## Warning: There were 3364 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3314 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.38, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3295 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.21, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 2879 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: There were 2688 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3094 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.21, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3456 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.45, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3428 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.08, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 2541 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3124 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.18, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
# Calculate average hold-out metrics across the 10 folds
bayes_cv_metrics <- data.frame(
  Mean_RMSE = mean(rmse_values),
  SD_RMSE = sd(rmse_values),
  Mean_Rsquared = mean(r2_values),
  SD_Rsquared = sd(r2_values)
)

# Print results
print("Bayesian Model Cross Validation Results:")
##                188.683 seconds (Total)
## Chain 4:
# NOTE(review): the knitted output above captured leftover Stan sampler log
# lines instead of the printed string/table — re-knit to capture it cleanly.
print(bayes_cv_metrics)
# Compare Bayesian CV results with previous results.
# NOTE(review): the RMSE column is comparable (both are fold-averaged
# hold-out RMSEs), but the Rsquared column is not strictly like-for-like:
# the linear value is caret's resampled R^2 while the Bayesian value is
# bayes_R2() computed on each fold's training fit.
cv_comparison <- data.frame(
  Model = c("Linear Model CV", "Bayesian Model CV"),
  RMSE = c(mean(cv_model_scaled$resample$RMSE), bayes_cv_metrics$Mean_RMSE),
  Rsquared = c(mean(cv_model_scaled$resample$Rsquared), bayes_cv_metrics$Mean_Rsquared)
)

print("Model Comparison:")
## [1] "Model Comparison:"
print(cv_comparison)
##               Model      RMSE  Rsquared
## 1   Linear Model CV 0.7276648 0.4705238
## 2 Bayesian Model CV 0.7276500 0.4708232
# Visualize the per-fold RMSE distributions side by side
boxplot(
  list(
    "Linear Model" = cv_model_scaled$resample$RMSE,
    "Bayesian Model" = rmse_values
  ),
  main = "RMSE Comparison",
  ylab = "RMSE"
)

3.2.3 lm VS bayesian comparison table

library(gridExtra)
library(grid)

# Plotmath labels for the table rows: nine main-effect betas, three
# interaction gammas, the intercept, and the CV R^2 row
row_labels <- c(expression(beta[1]), expression(beta[2]), expression(beta[3]),
                expression(beta[4]), expression(beta[5]), expression(beta[6]),
                expression(beta[7]), expression(beta[8]), expression(beta[9]),
                expression(gamma[1]), expression(gamma[2]), expression(gamma[3]),
                expression(beta[0]), "CV R²")

col_labels <- c("Linear Regression Model", "Bayesian Regression Model")

# Coefficient estimates transcribed by hand from the earlier lm and brms
# fits (stored as strings so scientific and plain notation both display
# as written). NOTE(review): these values are hard-coded — re-check them
# against the model summaries if either model is refit.
data <- matrix(c(
  "5.511e+03", "5511.41",
  "1.122e+01", "11.22",
  "3.589e+00", "3.59",
  "1.356e+03", "1359.77",
  "-1.622e+03", "-1630.33",
  "6.657e+03", "6652.42",
  "-5.092e+03", "-5091.12",
  "1.241e+03", "1242.18",
  "3.637e+03", "3637.24",
  "-2.049e+00", "-2.05",
  "-3.642e+00", "-3.64",
  "-1.477e+03", "-1482.39",
  "-2.200e+02", "-220.30",
  "0.4705238", "0.4709003"
), ncol = 2, byrow = TRUE)

df <- as.data.frame(data, stringsAsFactors = FALSE)
colnames(df) <- col_labels
# NOTE(review): row names must be character; these expressions get coerced,
# and grid.table() below is given rows = row_labels anyway, so this
# assignment appears redundant — candidate for removal.
rownames(df) <- sapply(row_labels, function(x) as.expression(x))

# Render the comparison table to a PNG. dev.off() runs via `finally` so the
# graphics device is not leaked if grid.table() errors mid-render (the
# original left the device open on failure).
png("academic_table_truth.png", width = 1000, height = 700, res = 150)
tryCatch(
  grid.table(df, rows = row_labels, theme = ttheme_default(
    core = list(fg_params = list(cex = 1.0)),
    colhead = list(fg_params = list(fontface = "bold", cex = 1.1)),
    rowhead = list(fg_params = list(fontface = "bold", cex = 1.1))
  )),
  finally = dev.off()
)
## quartz_off_screen 
##                 2

3.3 model improvement

Why improve?

The cross-validated RMSE is about 0.73 standard deviations of Fare (R² ≈ 0.47), so the baseline linear model explains less than half of the fare variation; the sections below try several ways to improve it.

# Re-seed and rebuild the 10-fold CV control shared by the improvement fits
set.seed(123)
train_control <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)

3.3.1 add non linear terms

# solution 1: add non linear terms
model_nonlinear <- train(
  Fare_scaled ~ Number.Of.Stops + Total_Minutes + I(Total_Minutes^2) + 
        distance + I(distance^2) +  # add squared terms
        IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + 
        Departure.Off.Peak + Arrival.Off.Peak +
        Is_Low_Cost:distance +
        Total_Minutes:Number.Of.Stops +
        Departure.Off.Peak:Arrival.Off.Peak,
  data = flight_data_scaled,
  method = "lm",
  trControl = train_control
)
## + Fold01: intercept=TRUE 
## - Fold01: intercept=TRUE 
## + Fold02: intercept=TRUE 
## - Fold02: intercept=TRUE 
## + Fold03: intercept=TRUE 
## - Fold03: intercept=TRUE 
## + Fold04: intercept=TRUE 
## - Fold04: intercept=TRUE 
## + Fold05: intercept=TRUE 
## - Fold05: intercept=TRUE 
## + Fold06: intercept=TRUE 
## - Fold06: intercept=TRUE 
## + Fold07: intercept=TRUE 
## - Fold07: intercept=TRUE 
## + Fold08: intercept=TRUE 
## - Fold08: intercept=TRUE 
## + Fold09: intercept=TRUE 
## - Fold09: intercept=TRUE 
## + Fold10: intercept=TRUE 
## - Fold10: intercept=TRUE 
## Aggregating results
## Fitting final model on full training set
# CV metrics for the quadratic specification
print("Results of non-linear model: ")
## [1] "Results of non-linear model: "
print(model_nonlinear$results)
##   intercept      RMSE Rsquared       MAE     RMSESD RsquaredSD       MAESD
## 1      TRUE 0.6874839  0.52742 0.4724112 0.01164206 0.01275202 0.005514287

3.3.2 Random Forest

# Solution 2: Random Forest
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Solution 2: random forest on the nine main effects only — the explicit
# interaction terms are dropped here, leaving interactions to the trees
model_rf <- train(
  Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
    IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
    Departure.Off.Peak + Arrival.Off.Peak,
  data = flight_data_scaled,
  method = "rf",
  trControl = train_control,
  ntree = 500,          # trees per forest
  importance = TRUE     # record variable importance for varImp() below
)
## + Fold01: mtry=2 
## - Fold01: mtry=2 
## + Fold01: mtry=5 
## - Fold01: mtry=5 
## + Fold01: mtry=9 
## - Fold01: mtry=9 
## + Fold02: mtry=2 
## - Fold02: mtry=2 
## + Fold02: mtry=5 
## - Fold02: mtry=5 
## + Fold02: mtry=9 
## - Fold02: mtry=9 
## + Fold03: mtry=2 
## - Fold03: mtry=2 
## + Fold03: mtry=5 
## - Fold03: mtry=5 
## + Fold03: mtry=9 
## - Fold03: mtry=9 
## + Fold04: mtry=2 
## - Fold04: mtry=2 
## + Fold04: mtry=5 
## - Fold04: mtry=5 
## + Fold04: mtry=9 
## - Fold04: mtry=9 
## + Fold05: mtry=2 
## - Fold05: mtry=2 
## + Fold05: mtry=5 
## - Fold05: mtry=5 
## + Fold05: mtry=9 
## - Fold05: mtry=9 
## + Fold06: mtry=2 
## - Fold06: mtry=2 
## + Fold06: mtry=5 
## - Fold06: mtry=5 
## + Fold06: mtry=9 
## - Fold06: mtry=9 
## + Fold07: mtry=2 
## - Fold07: mtry=2 
## + Fold07: mtry=5 
## - Fold07: mtry=5 
## + Fold07: mtry=9 
## - Fold07: mtry=9 
## + Fold08: mtry=2 
## - Fold08: mtry=2 
## + Fold08: mtry=5 
## - Fold08: mtry=5 
## + Fold08: mtry=9 
## - Fold08: mtry=9 
## + Fold09: mtry=2 
## - Fold09: mtry=2 
## + Fold09: mtry=5 
## - Fold09: mtry=5 
## + Fold09: mtry=9 
## - Fold09: mtry=9 
## + Fold10: mtry=2 
## - Fold10: mtry=2 
## + Fold10: mtry=5 
## - Fold10: mtry=5 
## + Fold10: mtry=9 
## - Fold10: mtry=9 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 9 on full training set
# CV metrics per mtry value; caret keeps the best (lowest-RMSE) setting
print("Results of random forest: ")
## [1] "Results of random forest: "
print(model_rf$results)
##   mtry      RMSE  Rsquared       MAE      RMSESD RsquaredSD       MAESD
## 1    2 0.6445977 0.6011966 0.4479724 0.009084500 0.01176618 0.007406143
## 2    5 0.5471292 0.7007798 0.3576632 0.008795583 0.01069441 0.006313989
## 3    9 0.5338309 0.7152616 0.3447460 0.009309267 0.01041033 0.006670816
# Scaled (0-100) importance of each predictor in the final forest
importance_rf <- varImp(model_rf)
print("feature importance")
## [1] "feature importance"
print(importance_rf)
## rf variable importance
## 
##                    Overall
## distance           100.000
## Total_Minutes       34.386
## Number.Of.Stops     28.597
## Arrival.Off.Peak    20.352
## Departure.Off.Peak  19.863
## Low_Cost_Count      15.483
## Is_Low_Cost          6.500
## IsWeekend            5.392
## ifHoliday            0.000

3.3.3 Feature Engineering

# Solution 3: hand-crafted features layered on top of the scaled data
flight_data_engineered <- flight_data_scaled %>%
  mutate(
    # 1 when either the departure or the arrival is in a peak window
    peak_hours = ifelse(Departure.Off.Peak == 0 | Arrival.Off.Peak == 0, 1, 0),
    # speed proxy — NOTE(review): assumes Total_Minutes > 0 everywhere,
    # otherwise this produces Inf/NaN; confirm against the cleaned data
    distance_per_minute = distance/Total_Minutes,
    total_cost_factor = Is_Low_Cost * Low_Cost_Count,
    weekend_holiday = IsWeekend * ifHoliday,
    # NOTE(review): likewise assumes distance > 0 for every row — confirm
    stops_per_distance = Number.Of.Stops/distance
  )

# Linear model over the original main effects plus the engineered features
model_engineered <- train(
  Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
    IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
    Departure.Off.Peak + Arrival.Off.Peak +
    peak_hours + distance_per_minute + total_cost_factor +
    weekend_holiday + stops_per_distance,
  data = flight_data_engineered,
  method = "lm",
  trControl = train_control
)
## + Fold01: intercept=TRUE 
## - Fold01: intercept=TRUE 
## + Fold02: intercept=TRUE 
## - Fold02: intercept=TRUE 
## + Fold03: intercept=TRUE 
## - Fold03: intercept=TRUE 
## + Fold04: intercept=TRUE 
## - Fold04: intercept=TRUE 
## + Fold05: intercept=TRUE 
## - Fold05: intercept=TRUE 
## + Fold06: intercept=TRUE 
## - Fold06: intercept=TRUE 
## + Fold07: intercept=TRUE 
## - Fold07: intercept=TRUE 
## + Fold08: intercept=TRUE 
## - Fold08: intercept=TRUE 
## + Fold09: intercept=TRUE 
## - Fold09: intercept=TRUE 
## + Fold10: intercept=TRUE 
## - Fold10: intercept=TRUE 
## Aggregating results
## Fitting final model on full training set
print("Results of feature engineering: ")
## [1] "Results of feature engineering: "
print(model_engineered$results)
##   intercept      RMSE  Rsquared       MAE     RMSESD RsquaredSD       MAESD
## 1      TRUE 0.7246163 0.4749353 0.5071564 0.01426998 0.01644719 0.006927153

3.3.4 Hierarchical Bayesian Model

# Solution 4: Hierarchical Bayesian Model
# Same fixed effects and interactions as before, plus a single random
# slope of distance grouped by airline type.
# NOTE(review): Is_Low_Cost appears to be a binary indicator (it is also a
# fixed effect in this same formula), so the grouping factor has only two
# levels — far too few to estimate a group-level variance reliably. The
# divergent transitions and R-hat = 4.31 reported below are consistent with
# that; consider an Is_Low_Cost:distance interaction instead, or stronger
# priors — confirm before reusing this fit.
bayes_model <- brm(
  Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance + 
  IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + 
  Departure.Off.Peak + Arrival.Off.Peak +
  Is_Low_Cost:distance +
  Total_Minutes:Number.Of.Stops +
  Departure.Off.Peak:Arrival.Off.Peak +
  # Modify the hierarchical structure to retain only one random effect
  (distance | Is_Low_Cost),  # Allow different distance effects for different airline types
  data = flight_data_scaled,
  family = gaussian(),
  chains = 4,
  iter = 2000,   # 1000 warmup + 1000 sampling per chain (brms default split)
  cores = 4
)
## Compiling Stan program...
## Start sampling
## Warning: There were 362 divergent transitions after warmup. See
## https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: There were 3608 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: There were 3 chains where the estimated Bayesian Fraction of Missing Information was low. See
## https://mc-stan.org/misc/warnings.html#bfmi-low
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 4.31, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
# Posterior summary; the convergence warnings below apply to this fit
summary(bayes_model)
## Warning: Parts of the model have not converged (some Rhats are > 1.05). Be
## careful when analysing the results! We recommend running more iterations and/or
## setting stronger priors.
## Warning: There were 362 divergent transitions after warmup. Increasing
## adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
# Posterior-mean predictions on the full (training) data set
predictions_bayes <- predict(bayes_model, flight_data_scaled)[,"Estimate"]

# Calculate Bayes R2 (returns a 1-row matrix: Estimate, Est.Error, CIs)
bayes_r2 <- bayes_R2(bayes_model)
print("bayes R²: ")
print(bayes_r2)
## 1411095 0.4637851 0.5118908
# NOTE(review): the captured output above is garbled (header row and the
# Estimate column are missing) — re-knit to record the full matrix.

3.3.5 Compare four solutions

# Compare the performance of the four improvement solutions.
# NOTE(review): the first three RMSE/R^2 values are cross-validated, but the
# hierarchical-Bayesian RMSE is computed in-sample on the full training data
# (predictions_bayes) and its R^2 is bayes_R2() on the same fit — so row 4
# is not directly comparable to rows 1-3.
results_comparison <- data.frame(
  Model = c("non-linear model", "random forest", "feature engineering","hierarchical bayesian model"),
  RMSE = c(
    min(model_nonlinear$results$RMSE),
    min(model_rf$results$RMSE),
    min(model_engineered$results$RMSE),
    sqrt(mean((predictions_bayes - flight_data_scaled$Fare_scaled)^2))
  ),
  Rsquared = c(
    max(model_nonlinear$results$Rsquared),
    max(model_rf$results$Rsquared),
    max(model_engineered$results$Rsquared),
    mean(bayes_r2[,1])
  )
)

print("model comparison: ")
## [1] "model comparison: "
print(results_comparison)
##                         Model      RMSE  Rsquared
## 1            non-linear model 0.6874839 0.5274200
## 2               random forest 0.5338309 0.7152616
## 3         feature engineering 0.7246163 0.4749353
## 4 hierarchical bayesian model 0.7368088 0.4779538
##                         Model      RMSE  Rsquared
## 1            non-linear model 0.6874839 0.5274200
## 2               random forest 0.5338309 0.7152616
## 3         feature engineering 0.7246163 0.4749353
## 4 hierarchical bayesian model 0.7368088 0.4779538
# compare by visualization

# In-sample predictions from each model on the full data set
predictions_nonlinear <- predict(model_nonlinear, flight_data_scaled)
predictions_rf <- predict(model_rf, flight_data_scaled)
predictions_engineered <- predict(model_engineered, flight_data_engineered)
# recomputed here; identical to the earlier predictions_bayes assignment
predictions_bayes <- predict(bayes_model, flight_data_scaled)[,"Estimate"]

# create comparison plots of prediction values
library(ggplot2)
library(gridExtra)

# Helper: scatter of predicted vs actual (scaled) fares with a y = x
# reference line. Extracted because the four panels below were identical
# copy-pasted plots differing only in predictions and title.
plot_pred_vs_actual <- function(predicted, title) {
  ggplot(data.frame(actual = flight_data_scaled$Fare_scaled,
                    predicted = predicted),
         aes(x = actual, y = predicted)) +
    geom_point(alpha = 0.5) +
    geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
    theme_minimal() +
    labs(title = title)
}

p1 <- plot_pred_vs_actual(predictions_nonlinear,
                          "performance of non-linear model")
p2 <- plot_pred_vs_actual(predictions_rf,
                          "performance of random forest")
p3 <- plot_pred_vs_actual(predictions_engineered,
                          "performance of feature engineering")
p4 <- plot_pred_vs_actual(predictions_bayes,
                          "Hierarchical Bayesian model prediction performance")

# 2x2 grid of the four diagnostic panels
grid.arrange(p1, p2, p3, p4, ncol = 2)